#Packages
library(tidyverse)
library(stringr)
library(readr)
library(sperrorest)

#Download district datasets
d_09 <- read_csv("C:/Users/marka/Dropbox/Policy Representation CCES/Intermediate Data/Census Data/nhgis0076_csv/nhgis0076_csv/nhgis0076_ds146_2000_cd109th.csv") %>%
  left_join(read_csv("C:/Users/marka/Dropbox/Policy Representation CCES/Intermediate Data/Census Data/nhgis0076_csv/nhgis0076_csv/nhgis0076_ds151_2000_cd109th.csv"))
d_10_12 <- read_csv("C:/Users/marka/Dropbox/Policy Representation CCES/Intermediate Data/Census Data/nhgis0068_csv/nhgis0068_csv/nhgis0068_ds184_20115_cd110th-112th.csv")
d_13_14 <- read_csv("C:/Users/marka/Dropbox/Policy Representation CCES/Intermediate Data/Census Data/nhgis0068_csv/nhgis0068_csv/nhgis0068_ds215_20155_cd113th-114th.csv")
d_15 <- read_csv("C:/Users/marka/Dropbox/Policy Representation CCES/Intermediate Data/Census Data/nhgis0068_csv/nhgis0068_csv/nhgis0068_ds233_20175_cd115th.csv")
d_16 <- read_csv("C:/Users/marka/Dropbox/Policy Representation CCES/Intermediate Data/Census Data/nhgis0068_csv/nhgis0068_csv/nhgis0068_ds249_20205_cd116th.csv")
d_16_ed <- read_csv("C:/Users/marka/Dropbox/Policy Representation CCES/Intermediate Data/Census Data/nhgis0068_csv/nhgis0068_csv/nhgis0068_ds254_20215_cd116th.csv")

#Clean district datasets
d_09 <- d_09  %>%
  rowwise() %>%
  mutate(congress = "110th",
  d_pop = sum(c_across(FMZ001:FMZ046)),
  d_vap = sum(c_across(FMZ005:FMZ023)) + sum(c_across(FMZ028:FMZ046)),
  d_p_under35 = sum(c_across(FMZ005:FMZ010)) + sum(c_across(FMZ028:FMZ033)),
  d_p_35to49 = sum(c_across(FMZ011:FMZ013)) + sum(c_across(FMZ034:FMZ036)),
  d_p_50to64 = sum(c_across(FMZ014:FMZ017)) + sum(c_across(FMZ037:FMZ040)),
  d_p_over65 = sum(c_across(FMZ018:FMZ023)) + sum(c_across(FMZ041:FMZ046)),
  d_p_black = (FMS002+FMS009)/d_pop,
  d_p_white = FMS001/d_pop,
  d_p_api = (FMS004+FMS005+FMS011+FMS012)/d_pop,
  d_p_latino = sum(c_across(FMS008:FMS014))/d_pop,
  d_p_college = (sum(c_across(GKT013:GKT016))+sum(c_across(GKT029:GKT032)))/(sum(c_across(FMZ009:FMZ023)) + sum(c_across(FMZ032:FMZ046))), #prop over the age of 25
  d_p_hs = (sum(c_across(GKT009:GKT016))+sum(c_across(GKT025:GKT032)))/(sum(c_across(FMZ009:FMZ023)) + sum(c_across(FMZ032:FMZ046))),
  d_medinc = GMY001, 
  d_num =  CD109A,
  YEAR =  "2006"
  ) %>%
  rename(NAME_E = NAME) %>%
  select(c(GISJOIN:STUSAB,  NAME_E , congress:d_num))


#110th-112th congress
d_10_12 <- d_10_12  %>%
  rowwise() %>%
  mutate(congress = "110th, 111th, 112th",
    d_pop = MNTE001,
    d_vap = sum(c_across(MNIE007:MNIE025))+sum(c_across(MNIE031:MNIE049)),
    d_p_under35 = (sum(c_across(MNIE007:MNIE012))+sum(c_across(MNIE031:MNIE036)))/d_pop,
    d_p_35to49 = (sum(c_across(MNIE013:MNIE015))+sum(c_across(MNIE037:MNIE039)))/d_pop,
    d_p_50to64 = (sum(c_across(MNIE016:MNIE019))+sum(c_across(MNIE040:MNIE043)))/d_pop,
    d_p_over65 = (sum(c_across(MNIE020:MNIE025))+sum(c_across(MNIE044:MNIE049)))/d_pop, 
    d_p_black = (MN2E004+MN2E014)/d_pop,
    d_p_white = MN2E003/d_pop,
    d_p_api = (MN2E006+MN2E007+MN2E016+MN2E017)/d_pop,
    d_p_latino = MN2E012/d_pop,
    d_p_college = (sum(c_across(MPSE015:MPSE018))+sum(c_across(MPSE032:MPSE035)))/MPSE001,
    d_p_hs = (sum(c_across(MPSE011:MPSE018))+sum(c_across(MPSE028:MPSE035)))/MPSE001,
    d_medinc = MP1E001, 
    d_num =  CDCURRA
    ) %>%
  select(c(GISJOIN:STUSAB, GEOID, NAME_E , congress:d_num))

#113th-114th congress
d_13_14 <- d_13_14  %>%
  rowwise() %>%
  mutate(congress = "113th, 114th",
    d_pop = ADKWE001,
         d_vap = sum(c_across(ADKLE007:ADKLE025))+sum(c_across(ADKLE031:ADKLE049)),
         d_p_under35 = (sum(c_across(ADKLE007:ADKLE012))+sum(c_across(ADKLE031:ADKLE036)))/d_pop,
         d_p_35to49 = (sum(c_across(ADKLE013:ADKLE015))+sum(c_across(ADKLE037:ADKLE039)))/d_pop,
         d_p_50to64 = (sum(c_across(ADKLE016:ADKLE019))+sum(c_across(ADKLE040:ADKLE043)))/d_pop,
         d_p_over65 = (sum(c_across(ADKLE020:ADKLE025))+sum(c_across(ADKLE044:ADKLE049)))/d_pop, 
         d_p_black = (ADK5E004+ADK5E014)/d_pop,
         d_p_white = ADK5E003/d_pop,
         d_p_api = (ADK5E006+ADK5E007+ADK5E016+ADK5E017)/d_pop,
         d_p_latino = ADK5E012/d_pop,
         d_p_college = sum(c_across(ADMZE022:ADMZE025))/ADMZE001,
         d_p_hs = sum(c_across(ADMZE017:ADMZE025))/ADMZE001,
         d_medinc = ADNKE001, 
    d_num =  CDCURRA
  ) %>%
  select(c(GISJOIN:STUSAB, GEOID, NAME_E , congress:d_num))

#115th congress
d_15 <- d_15  %>%
  rowwise() %>%
  mutate(congress = "115th",
         d_pop = AHY1E001,
         d_vap = sum(c_across(AHYQE007:AHYQE025))+sum(c_across(AHYQE031:AHYQE049)),
         d_p_under35 = (sum(c_across(AHYQE007:AHYQE012))+sum(c_across(AHYQE031:AHYQE036)))/d_pop,
         d_p_35to49 = (sum(c_across(AHYQE013:AHYQE015))+sum(c_across(AHYQE037:AHYQE039)))/d_pop,
         d_p_50to64 = (sum(c_across(AHYQE016:AHYQE019))+sum(c_across(AHYQE040:AHYQE043)))/d_pop,
         d_p_over65 = (sum(c_across(AHYQE020:AHYQE025))+sum(c_across(AHYQE044:AHYQE049)))/d_pop, 
         d_p_black = (AHZAE004+AHZAE014)/d_pop,
         d_p_white = AHZAE003/d_pop,
         d_p_api = (AHZAE006+AHZAE007+AHZAE016+AHZAE017)/d_pop,
         d_p_latino = AHZAE012/d_pop,
         d_p_college = sum(c_across(AH04E022:AH04E025))/AH04E001,
         d_p_hs = sum(c_across(AH04E017:AH04E025))/AH04E001,
         d_medinc = AH1PE001, 
         d_num =  CDCURRA
  ) %>%
  select(c(GISJOIN:STUSAB, GEOID, NAME_E , congress:d_num))

#116th congress
d_16_17 <- d_16  %>%
  inner_join(d_16_ed[,c(1,42:92)], by = c("GISJOIN")) %>%
  rowwise() %>%
  mutate(congress = "116th, 117th",
         d_pop = AMPVE001,
         d_vap = sum(c_across(AMPKE007:AMPKE025))+sum(c_across(AMPKE031:AMPKE049)),
         d_p_under35 = (sum(c_across(AMPKE007:AMPKE012))+sum(c_across(AMPKE031:AMPKE036)))/d_pop,
         d_p_35to49 = (sum(c_across(AMPKE013:AMPKE015))+sum(c_across(AMPKE037:AMPKE039)))/d_pop,
         d_p_50to64 = (sum(c_across(AMPKE016:AMPKE019))+sum(c_across(AMPKE040:AMPKE043)))/d_pop,
         d_p_over65 = (sum(c_across(AMPKE020:AMPKE025))+sum(c_across(AMPKE044:AMPKE049)))/d_pop, 
         d_p_black = (AMP3E004+AMP3E014)/d_pop,
         d_p_white = AMP3E003/d_pop,
         d_p_api = (AMP3E006+AMP3E007+AMP3E016+AMP3E017)/d_pop,
         d_p_latino = AMP3E012/d_pop,
         d_p_college = sum(c_across(AOP8E022:AOP8E025))/AOP8E001,
         d_p_hs = sum(c_across(AOP8E017:AOP8E025))/AOP8E001,
         d_medinc = AMR8E001, 
         d_num =  CDCURRA
  ) %>%
  select(c(GISJOIN:STUSAB, GEOID, NAME_E , congress:d_num))

#Creating one per congress for later join
d_10 <- d_10_12 %>% mutate(congress = "110th")
d_11 <- d_10_12 %>% mutate(congress = "111th")
d_12 <- d_10_12 %>% mutate(congress = "112th")
d_13 <- d_13_14 %>% mutate(congress = "113th")
d_14 <- d_13_14 %>% mutate(congress = "114th")
d_16 <- d_16_17 %>% mutate(congress = "116th")
d_17 <- d_16_17 %>% mutate(congress = "117th")

#Binding rows
d <- bind_rows(d_09, d_10, d_11, d_12, d_13, d_14, d_15, d_16, d_17)
hist(d$d_p_black)

#### State data #####

#Downloading state data for ACS (not race)
s08 <- read_csv("C:/Users/marka/Dropbox/Policy Representation CCES/Intermediate Data/Census Data/nhgis0070_csv/nhgis0070_csv/nhgis0070_ds176_20105_state.csv") %>%
  rowwise() %>%
  mutate(s_pop = JMAE001,
         s_vap =  sum(c_across(JLZE007:JLZE025))+sum(c_across(JLZE031:JLZE049)),
         s_p_under35 = (sum(c_across(JLZE007:JLZE012))+sum(c_across(JLZE031:JLZE036)))/s_pop,
         s_p_35to49 = (sum(c_across(JLZE013:JLZE015))+sum(c_across(JLZE037:JLZE039)))/s_pop,
         s_p_50to64 = (sum(c_across(JLZE016:JLZE019))+sum(c_across(JLZE040:JLZE043)))/s_pop,
         s_p_over65 = (sum(c_across(JLZE020:JLZE025))+sum(c_across(JLZE044:JLZE049)))/s_pop,
         s_p_hs = (sum(c_across(JN9E011:JN9E018))+sum(c_across(JN9E028:JN9E035)))/JN9E001,
         s_p_college = (sum(c_across(JN9E015:JN9E018))+sum(c_across(JN9E032:JN9E035)))/JN9E001,
         s_medinc = JOIE001,
         YEAR = 2010) %>%
  select(c(GISJOIN, YEAR, STATE, s_pop:s_medinc)) 


s21 <- read_csv("C:/Users/marka/Dropbox/Policy Representation CCES/Intermediate Data/Census Data/nhgis0070_csv/nhgis0070_csv/nhgis0070_ds254_20215_state.csv") %>%
  rowwise() %>%
  mutate(s_pop = AON4E001,
         s_vap =  sum(c_across(AONTE007:AONTE025))+sum(c_across(AONTE031:AONTE049)),
         s_p_under35 = (sum(c_across(AONTE007:AONTE012))+sum(c_across(AONTE031:AONTE036)))/s_pop,
         s_p_35to49 = (sum(c_across(AONTE013:AONTE015))+sum(c_across(AONTE037:AONTE039)))/s_pop,
         s_p_50to64 = (sum(c_across(AONTE016:AONTE019))+sum(c_across(AONTE040:AONTE043)))/s_pop,
         s_p_over65 = (sum(c_across(AONTE020:AONTE025))+sum(c_across(AONTE044:AONTE049)))/s_pop,
         s_p_hs = sum(c_across(AOP8E017:AOP8E025))/AOP8E001,
         s_p_college = sum(c_across(AOP8E022:AOP8E025))/AOP8E001,
         s_medinc = AOQIE001,
         YEAR = 2021) %>%
  select(c(GISJOIN, YEAR, STATE, s_pop:s_medinc)) 

#Binding data
s <- bind_rows(s08, s21)

#Filling out in between years
s <- s %>% 
  group_by(STATE, GISJOIN) %>% 
  complete(YEAR = full_seq(2006:2022, 1)) %>%
  arrange(YEAR)

#interpolating population data
m1 <- lm(s_pop ~ as.factor(STATE) * as.numeric(YEAR), s)
s$s_pop <- predict(m1, newdata=remove_missing_levels (fit=m1, test_data=s))

#Merging in race data
s <- read_csv("C:/Users/marka/Dropbox/Policy Representation CCES/Intermediate Data/Census Data/nhgis0069_csv/nhgis0069_csv/nhgis0069_ts_geog2010_state.csv") %>%
  rename(YEAR = DATAYEAR) %>%
  right_join(s, by = c("GISJOIN", "YEAR", "STATE")) %>%
  mutate(s_p_black = CW7AB/s_pop,
         s_p_white = CW7AA/s_pop,
         s_p_api = CW7AD/s_pop,
         s_p_latino = (CW7AG + CW7AH + CW7AI + CW7AJ + CW7AK + CW7AL)/s_pop) %>%
  select(c(GISJOIN, YEAR, STATE, s_pop:s_medinc, s_p_black:s_p_latino)) 

#Interpolating


m2 <- lm(s_vap ~ as.factor(STATE) * as.numeric(YEAR), s)
s$s_vap <- predict(m2, newdata=remove_missing_levels (fit=m2, test_data=s))

m3 <- lm(s_p_under35 ~ as.factor(STATE) * as.numeric(YEAR), s)
s$s_p_under35 <- predict(m3, newdata=remove_missing_levels (fit=m3, test_data=s))

m3.2 <- lm(s_p_35to49 ~ as.factor(STATE) * as.numeric(YEAR), s)
s$s_p_35to49 <- predict(m3.2, newdata=remove_missing_levels (fit=m3.2, test_data=s))

m3.3 <- lm(s_p_50to64 ~ as.factor(STATE) * as.numeric(YEAR), s)
s$s_p_50to64 <- predict(m3.2, newdata=remove_missing_levels (fit=m3.3, test_data=s))

m4 <- lm(s_p_over65 ~ as.factor(STATE) * as.numeric(YEAR), s)
s$s_p_over65 <- predict(m4, newdata=remove_missing_levels (fit=m4, test_data=s))

m5 <- lm(s_p_hs ~ as.factor(STATE) * as.numeric(YEAR), s)
s$s_p_hs <- predict(m5,newdata=remove_missing_levels (fit=m5, test_data=s))

m6 <- lm(s_p_college ~ as.factor(STATE) * as.numeric(YEAR), s)
s$s_p_college <- predict(m6,newdata=remove_missing_levels (fit=m6, test_data=s))

m7 <- lm(s_medinc ~ as.factor(STATE) * as.numeric(YEAR), s)
s$s_medinc <- predict(m7,newdata=remove_missing_levels (fit=m7, test_data=s))

m9 <- lm(s_p_black ~ as.factor(STATE) * as.numeric(YEAR), s)
s$s_p_black <- predict(m9,newdata=remove_missing_levels (fit=m9, test_data=s))

m10 <- lm(s_p_white ~ as.factor(STATE) * as.numeric(YEAR), s)
s$s_p_white <- predict(m10,newdata=remove_missing_levels (fit=m10, test_data=s))

m11 <- lm(s_p_latino ~ as.factor(STATE) * as.numeric(YEAR), s)
s$s_p_latino <- predict(m11,newdata=remove_missing_levels (fit=m11, test_data=s))

m12 <- lm(s_p_api ~ as.factor(STATE) * as.numeric(YEAR), s)
s$s_p_api <- predict(m12,newdata=remove_missing_levels (fit=m12, test_data=s))

#Filtering out non-states
s <- filter(s, STATE != "District of Columbia" & STATE != "District Of Columbia" & STATE != "Puerto Rico")


#Saving Data
write.csv(d, "C:/Users/marka/Dropbox/Policy Representation CCES/Intermediate Data/Census Data/DistrictCensusData_CCES_070323.csv")
write.csv(s, "C:/Users/marka/Dropbox/Policy Representation CCES/Intermediate Data/Census Data/StateCensusData_CCES_070323.csv")

